import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import sys
from p5lib import *
import pickle
import os.path
%matplotlib inline
pickle_file = 'reduced_data.pickle'
if os.path.isfile(pickle_file):
data = pickle.load(open(pickle_file, "rb"))
else:
data = pd.read_csv('MERGED2013_PP.csv')
data = preprocess_data(data)
pickle.dump(data, open(pickle_file, "wb"))
print data.shape
# There is not a single college that has all the data for all features
print "Num of rows that has all the features: {}".format(len(data.dropna()))
print "-----"
print_num_data_for_each_features(data)
The completion data is more spread out between 0 and 1 while retention data is more concentrated in 0.5 - 1 range, with a lot of outliers below 0.3.
print "--- Completion ---"
print data.C150.describe()
print "--- Retention ---"
print data.RET_FT.describe()
plt.title("Statistical analysis of Completion and Retention data")
plt.boxplot([data.C150, data.RET_FT], labels=['Completion', 'Retention'])
plt.show()
plt.scatter(data['RET_FT'], data['C150'], c='b', s=2, linewidths=0)
plt.title('Retention VS Completion rate')
plt.xlabel("Retention")
plt.ylabel("Completion")
plt.show()
For each feature we will show the statistical analyis (mean, median, std dev), and use whisker plot to see the distribution. To see if the selected potential features have any correlation with the Completion and Retention, we will plot each features against Completion and Retention. In addition we will also use different colors to indicate different type of college.
For plotting purpose, split data based on:
This will allow us to plot the different college types with different colors
data['L4_COLLEGE'] = data['L4_COLLEGE'].astype(int)
data_L4 = data[data['L4_COLLEGE'] == 1]
data_4 = data[data['L4_COLLEGE'] == 0]
data_control1 = data[data.CONTROL == 1] # public
data_control2 = data[data.CONTROL == 2] # private non-profit
data_control3 = data[data.CONTROL == 3] # private profit
data_for_plotting = {
'data': data,
'data_L4': data_L4,
'data_4': data_4,
'data_control1': data_control1,
'data_control2': data_control2,
'data_control3': data_control3
}
# --- group similar features together ---
faculty_cols = ['AVGFACSAL', 'PFTFAC', 'ADM_RATE_ALL']
stu_score_cols = ['ACTCMMID', 'SAT_AVG', 'SAT_AVG_ALL', 'SATVRMID', 'SATMTMID', 'SATWRMID']
cost_cols = ['COSTT4_A', 'COSTT4_P', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'TUITIONFEE_PROG', 'TUITFTE',
'INEXPFTE', 'NPT4']
grant_loan_cols = ['PCTPELL', 'PCTFLOAN']
stu_composition_cols = ['NUM4', 'UG25abv', 'PFTFTUG1_EF', 'UGDS', 'PAR_ED_PCT_1STGEN',
'PAR_ED_PCT_MS', 'PAR_ED_PCT_HS',
'PAR_ED_PCT_PS', 'DEP_INC_AVG', 'IND_INC_AVG']
debt_cols = ['DEBT_MDN', 'GRAD_DEBT_MDN', 'WDRAW_DEBT_MDN']
for col in faculty_cols:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
There are some correlations for Full time faculty rate but it's pretty weak. For Admission rate, there is a nice descending line between 0 and 0.4, but once the admission rate is above 0.4 it starts to vary widely (for both completion and retention).
for col in ['CCSIZSET', 'CCUGPROF', 'CCBASIC', 'LOCALE', 'region', 'PREDDEG']:
plot_feature_vs_completion([col], data_for_plotting, categorical=True)
plot_feature_vs_retention([col], data_for_plotting)
Some carnegie classfications have higher completion and retention, so there is some correlation.
"Degree of urbanization", "Region", and "Predominant degree awarded" do not have much correlation with the completion rate. So these features are a good candidate to be excluded from building the model.
for col in stu_score_cols:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
There is a nice correlation between SAT and ACT score. The higher SAT / ACT score, the higher completion and retention is. The subject SAT score (math, writing, reading) follows the same SAT pattern, so we can just use the average SAT for building our model.
for col in cost_cols[:5]:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
There is an interesting similar correlation for "Average cost academic year" and "In state tuition". There is almost a straight positive linear correlation for "Out of state tuition".
Not very interesting correlation for Average cost program year and Tuition fee program year.
for col in cost_cols[5:]:
plot_feature_vs_completion([col], data_for_plotting, xscale='log')
plot_feature_vs_retention([col], data_for_plotting, xscale='log')
There is some correlation on Expense per FTE student. Not so much for Net revenue per FTE student and Average net price Title IV.
for col in grant_loan_cols:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
# print stu_composition_cols
plot_feature_vs_completion(['NUM4'], data_for_plotting, xscale='log')
plot_feature_vs_retention(['NUM4'], data_for_plotting, xscale='log')
plot_feature_vs_completion(['UGDS'], data_for_plotting, xscale='log')
plot_feature_vs_retention(['UGDS'], data_for_plotting, xscale='log')
for col in ['UG25abv', 'PFTFTUG1_EF']:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
# print stu_composition_cols
for col in stu_composition_cols[5:10]:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
There is a good correlation between Percentage of parent education post secondary and completion rate, and the inverse for Percentage of parent education high school. Percentage of parent education middle school does not have much correlation with completion rate, so that’s another candidate feature to drop.
for col in debt_cols:
plot_feature_vs_completion([col], data_for_plotting)
plot_feature_vs_retention([col], data_for_plotting)
There is some correlation between debt and completion rate, but it's not a clear linear correlation. It's more like a few clusters with gaussian distribution for each college type (private, public, profit).